Explore trip time, trip distance, and tip percentage
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Default size for figures that do not pass their own figsize.
plt.rcParams.update({"figure.figsize": (10, 10)})
In [2]:
# Locations of the raw fare and trip CSV files that the next cells load.
fares_data_file, trips_data_file = (
    '../../trip_fare/trip_fare_2.csv',
    '../../trip_data/trip_data_2.csv',
)
In [3]:
# Load a subset of the fare columns (selected by position) from the raw CSV.
fares_pd = pd.read_csv(fares_data_file, usecols=[0, 1, 3, 4, 5, 8, 10])
# Strip surrounding whitespace from every header so columns can be addressed by clean names
# (presumably the raw file pads its header fields -- confirm against the data).
fares_old_columns = fares_pd.columns
fares_pd_columns = [header.strip() for header in fares_old_columns]
fares_pd.columns = fares_pd_columns
# print(...) with a single argument produces the same output on Python 2 and Python 3.
print(fares_pd_columns)
# Last expression of the cell: the notebook displays the summary statistics.
fares_pd.describe()
Out[3]:
In [4]:
# Display the first rows of the fares table as a quick check on the load.
fares_pd.head()
Out[4]:
In [5]:
# NOTE: plain assignment, not a copy -- fares_short_pd refers to the same DataFrame as fares_pd,
# so the two columns below are overwritten in place.
fares_short_pd = fares_pd
# Keep only the first 8 characters of each identifier. The vectorised .str accessor replaces
# the per-element lambda and leaves missing values as NaN instead of raising.
fares_short_pd['medallion'] = fares_pd['medallion'].str[:8]
fares_short_pd['hack_license'] = fares_pd['hack_license'].str[:8]
# Drop the old name; the DataFrame stays reachable through fares_short_pd.
del fares_pd
In [6]:
# Write the fares table with shortened identifiers to disk, then drop the name so the
# DataFrame can be reclaimed.
# NOTE(review): to_csv is called without index=False, so by pandas' default the row index is
# written as an extra unnamed leading column; it will come back as a regular column when this
# file is re-read -- confirm this is intended.
fares_short_pd.to_csv('../data/trip_fare_short_2.csv')
del fares_short_pd
In [7]:
# Load column 1 plus columns 5..13 (selected by position) from the raw trip CSV.
# range() is wrapped in list() so the concatenation also works on Python 3, where
# `list + range` raises TypeError.
trips_pd = pd.read_csv(trips_data_file, usecols=[1] + list(range(5, 14)))
# Strip surrounding whitespace from every header so columns can be addressed by clean names
# (presumably the raw file pads its header fields -- confirm against the data).
trips_old_columns = trips_pd.columns
trips_pd_columns = [header.strip() for header in trips_old_columns]
trips_pd.columns = trips_pd_columns
# print(...) with a single argument produces the same output on Python 2 and Python 3.
print(trips_pd_columns)
# Last expression of the cell: the notebook displays the summary statistics.
trips_pd.describe()
Out[7]:
In [8]:
# NOTE: plain assignment, not a copy -- trips_short_pd refers to the same DataFrame as trips_pd,
# so the column below is overwritten in place.
trips_short_pd = trips_pd
# Keep only the first 8 characters of each hack_license. The vectorised .str accessor replaces
# the per-element lambda and leaves missing values as NaN instead of raising.
trips_short_pd['hack_license'] = trips_pd['hack_license'].str[:8]
# Drop the old name; the DataFrame stays reachable through trips_short_pd.
del trips_pd
In [9]:
# Write the trips table with shortened hack_license values to disk, then drop the name so the
# DataFrame can be reclaimed.
# NOTE(review): to_csv is called without index=False, so by pandas' default the row index is
# written as an extra unnamed leading column; it will come back as a regular column when this
# file is re-read -- confirm this is intended.
trips_short_pd.to_csv('../data/trip_data_short_2.csv')
del trips_short_pd
In [10]:
# Point both path variables at the shortened CSVs written earlier and load them again.
fares_data_file, trips_data_file = (
    '../data/trip_fare_short_2.csv',
    '../data/trip_data_short_2.csv',
)
fares_pd = pd.read_csv(fares_data_file)
trips_pd = pd.read_csv(trips_data_file)
# Outer merge with no explicit keys: pandas joins on every column name the two frames share.
# NOTE(review): that presumably includes the saved row-index column as well as hack_license
# and pickup_datetime -- confirm against the actual headers.
taxi_pd = fares_pd.merge(trips_pd, how='outer')
# The inputs are no longer needed once merged.
del fares_pd
del trips_pd
# Last expression of the cell: the notebook displays the first merged rows.
taxi_pd.head()
Out[10]:
In [11]:
# Write the merged fares+trips table to disk, then drop the name so the DataFrame can be reclaimed.
# NOTE(review): to_csv is called without index=False, so by pandas' default the row index is
# written as an extra unnamed leading column -- confirm this is intended.
taxi_pd.to_csv('../data/taxi_short_2.csv')
del taxi_pd
In [ ]:
In [12]:
# Reload the merged table from disk; the analysis cells that follow work on this DataFrame.
taxi_pd = pd.read_csv('../data/taxi_short_2.csv')
# Last expression of the cell: the notebook displays the summary statistics.
taxi_pd.describe()
Out[12]:
In [19]:
# Histogram of tip_amount, restricted to values between 0 and 20 and split into 200 bins.
plt.figure(figsize=(8,4.95))
plt.hist(taxi_pd.tip_amount, range=(0,20), bins=200)
plt.show()
In [17]:
# tip_frac: the tip expressed as a percentage of the pre-tip amount
# (total_amount minus tip_amount).
taxi_pd['tip_frac'] = (
    taxi_pd['tip_amount'] * 100.
    / (taxi_pd['total_amount'] - taxi_pd['tip_amount'])
)
# Last expression of the cell: the notebook displays the summary of the new column.
taxi_pd.tip_frac.describe()
Out[17]:
In [21]:
# Histogram of the tip percentage, restricted to 0..50 and split into 200 bins.
# The trailing semicolon keeps the notebook from echoing the return value of plt.hist.
plt.figure(figsize=(8,4.95))
plt.hist(taxi_pd.tip_frac, range=(0,50), bins=200);
In [22]:
# Centre (latitude, longitude) and half-widths, in degrees, of the box used to
# select pickups -- presumably centred on New York City.
center_lat, center_lng = 40.76, -73.925
dlat = dlng = 0.125
In [24]:
# Keep only rows that pass every sanity check below; all conditions are combined with a
# row-wise AND, so their order does not matter.
taxi_filter_pd = pd.DataFrame(taxi_pd[
    # Trip size: distance in (0.1, 50] and duration longer than 1 second.
    (taxi_pd['trip_distance'] > 0.1) &
    (taxi_pd['trip_distance'] <= 50) &
    (taxi_pd['trip_time_in_secs'] > 1) &
    # Fare below 50 and a whole multiple of 0.5 (twice the fare has no fractional part).
    (taxi_pd['fare_amount'] < 50) &
    ((taxi_pd['fare_amount'] * 2) % 1 == 0) &
    # Tip percentage in [0.1, 50): drops zero tips and extreme values.
    (taxi_pd['tip_frac'] >= 0.1) &
    (taxi_pd['tip_frac'] < 50) &
    # Pickup strictly inside the box centred on (center_lat, center_lng).
    (taxi_pd['pickup_latitude'] > center_lat - dlat) &
    (taxi_pd['pickup_latitude'] < center_lat + dlat) &
    (taxi_pd['pickup_longitude'] > center_lng - dlng) &
    (taxi_pd['pickup_longitude'] < center_lng + dlng)
])
# The unfiltered table is not used after this point.
del taxi_pd
In [25]:
# Tip-percentage histogram after filtering (original note: zero and autotipping issue).
# The trailing semicolon keeps the notebook from echoing the return value of plt.hist.
plt.figure(figsize=(8,4.95))
plt.hist(taxi_filter_pd.tip_frac, range=(0,50), bins=200);
In [26]:
# Quick look at where trips start: scatter the first 1000 pickup points,
# longitude on x (column 0) and latitude on y (column 1).
pickups = taxi_filter_pd[['pickup_longitude', 'pickup_latitude']].values
plt.figure(figsize=(8, 4.95))
# rcParams.update changes the global default font size, so it also affects later figures.
plt.rcParams.update({'font.size': 14})
plt.scatter(pickups[:1000,0], pickups[:1000,1])
plt.show()
In [27]:
# Upper limits of the binned ranges: trip distance, trip time in seconds (35 minutes)
# and tip percentage.
dist_max, time_max, tip_frac_max = 12., 35. * 60., 30.
# Number of bin *edges* per quantity; N evenly spaced edges delimit N - 1 bins.
ndistbins = ntimebins = 15
ntipbins = 30
# Evenly spaced edges from 0 up to each limit.
trip_dist_bins = np.linspace(0, dist_max, num=ndistbins)
trip_time_bins = np.linspace(0, time_max, num=ntimebins)
tip_frac_bins = np.linspace(0, tip_frac_max, num=ntipbins)
In [28]:
# Assign every filtered trip to a distance, time and tip-percentage bin using the edge arrays.
# pd.cut returns a Categorical; its integer .codes are what the later cells group by.
# Values that fall outside the edges get no bin (code -1), which is presumably why the
# MultiIndex ranges built later start at -1.
cdist = pd.cut(taxi_filter_pd.trip_distance.values, trip_dist_bins)
ctime = pd.cut(taxi_filter_pd.trip_time_in_secs.values, trip_time_bins)
ctip = pd.cut(taxi_filter_pd.tip_frac.values, tip_frac_bins)
In [29]:
# Display the distance Categorical produced by pd.cut.
cdist
Out[29]:
In [32]:
# Tip percentage of every filtered trip as a standalone Series, used as the value column
# for the grouped aggregations.
s_tipfrac = pd.Series(taxi_filter_pd['tip_frac'])
In [30]:
# Display the integer bin code of each trip's distance.
cdist.codes
Out[30]:
In [31]:
# Display the integer bin code of each trip's tip percentage.
ctip.codes
Out[31]:
In [33]:
# Aggregate the tip percentage over pairs of bin codes (the code arrays are used as group keys):
#   gb_dist_tip_cnts  -- number of fares per (distance bin, tip bin)
#   gb_tip_time_cnts  -- number of fares per (tip bin, time bin)
#   gb_dist_time_mean -- mean tip percentage per (distance bin, time bin)
# Only combinations that actually occur appear in the results.
gb_dist_tip_cnts = s_tipfrac.groupby([cdist.codes, ctip.codes]).count()
gb_tip_time_cnts = s_tipfrac.groupby([ctip.codes, ctime.codes]).count()
gb_dist_time_mean = s_tipfrac.groupby([cdist.codes, ctime.codes]).mean()
In [34]:
# Build the complete grid of bin-code pairs for each aggregate. On every axis the codes run
# from -1 up to n-2, i.e. n values, matching the n edges used for that quantity.
mi1 = pd.MultiIndex.from_product([range(-1, ndistbins-1), range(-1, ntipbins-1)], names=['dist', 'tip'])
mi2 = pd.MultiIndex.from_product([range(-1, ntipbins-1), range(-1, ntimebins-1)], names=['tip', 'time'])
mi3 = pd.MultiIndex.from_product([range(-1, ndistbins-1), range(-1, ntimebins-1)], names=['dist', 'time'])
# Reindex onto the full grid so that combinations with no fares become explicit entries,
# filled with 0. Each result then has a fixed length (n_a * n_b) that can be reshaped to 2-D.
# Note that empty cells of the mean grid are also set to 0.
gb_dist_tip_cnts_ri = gb_dist_tip_cnts.reindex(mi1).fillna(0)
gb_tip_time_cnts_ri = gb_tip_time_cnts.reindex(mi2).fillna(0)
gb_dist_time_mean_ri = gb_dist_time_mean.reindex(mi3).fillna(0)
In [41]:
# Two side-by-side heat maps of fare counts on a log10 colour scale, saved to number_of_fares.png.
plt.figure(figsize=(20, 10))
plt.rcParams.update({'font.size': 24})

# Left panel: number of fares per (trip distance, tip percentage) bin.
plt.subplot(1, 2, 1)
extent = [trip_dist_bins[0], trip_dist_bins[-1], tip_frac_bins[0], tip_frac_bins[-1]]
# Reshape the flat counts to (distance code, tip code), transpose so tip percentage runs along
# y, and drop the first row/column (bin code -1).
# np.maximum(..., 1) floors empty cells at one fare so log10 never receives 0 (which would give
# -inf and a divide-by-zero RuntimeWarning). Those cells still sit at the bottom of the colour
# scale (vmin=0), the tick labelled "<=1".
plt.imshow(np.log10(np.maximum(
               np.reshape(gb_dist_tip_cnts_ri.values, (ndistbins, ntipbins)).T[1:, 1:], 1)),
           origin='lower', extent=extent, aspect=dist_max/tip_frac_max, interpolation='none', vmin=0, vmax=6,
           cmap="RdYlBu")
plt.xlabel('trip distance (miles)')
plt.ylabel('tip percentage')
cbar = plt.colorbar(shrink=0.6, ticks=[0, 1, 2, 3, 4, 5, 6])
cbar.set_label('number of fares', rotation=270, labelpad=5)
cbar.set_ticklabels([r'$\leq1$', r'$10^1$', r'$10^2$', r'$10^3$', r'$10^4$', r'$10^5$', r'$\geq10^6$'])

# Right panel: number of fares per (tip percentage, trip time) bin; time is shown in minutes.
plt.subplot(1, 2, 2)
extent = [tip_frac_bins[0], tip_frac_bins[-1], trip_time_bins[0]/60., trip_time_bins[-1]/60.]
# Same reshape / transpose / drop-code--1 / floor-at-1 treatment as the left panel.
plt.imshow(np.log10(np.maximum(
               np.reshape(gb_tip_time_cnts_ri.values, (ntipbins, ntimebins)).T[1:, 1:], 1)),
           origin='lower', extent=extent, aspect=tip_frac_max/(time_max/60.), interpolation='none', vmin=0, vmax=6,
           cmap="RdYlBu")
plt.xlabel('tip percentage')
plt.ylabel('trip time (minutes)')
cbar = plt.colorbar(shrink=0.6, ticks=[0, 1, 2, 3, 4, 5, 6])
cbar.set_label('number of fares', rotation=270, labelpad=5)
cbar.set_ticklabels([r'$\leq1$', r'$10^1$', r'$10^2$', r'$10^3$', r'$10^4$', r'$10^5$', r'$\geq10^6$'])

plt.tight_layout()
plt.savefig('number_of_fares.png')
plt.show()
In [49]:
# Heat map of the mean tip percentage per (trip distance, trip time) bin.
plt.figure(figsize=(12, 12))
plt.rcParams.update({'font.size': 24})
#mean tip percentage vs. distance and time
#subplot(2, 2, 3)
# Axis limits: distance on x, time converted from seconds to minutes on y.
extent = [trip_dist_bins[0], trip_dist_bins[-1], trip_time_bins[0]/60., trip_time_bins[-1]/60.]
# Reshape the flat means to (distance code, time code), transpose so time runs along y, and
# drop the first row/column (bin code -1). The colour scale is limited to 15..20 percent.
# NOTE(review): interpolation is spelled 'None' here but 'none' in the other heat maps --
# confirm both spellings are accepted by the matplotlib version in use.
plt.imshow(np.reshape(gb_dist_time_mean_ri.values, (ndistbins, ntimebins)).T[1:, 1:],
origin='lower', extent=extent, aspect=dist_max/(time_max/60.), interpolation='None', vmin=15, vmax=20,
cmap="RdYlBu")
plt.xlabel('trip distance (miles)')
plt.ylabel('trip time (minutes)')
cbar = plt.colorbar(shrink=0.6, ticks=[15, 16, 17, 18, 19, 20])
cbar.set_label('mean tip percentage', rotation=270, labelpad=13)
# The end labels show that values outside 15..20 share the end colours.
cbar.set_ticklabels([r'$\leq$15', '16', '17', '18', '19', r'$\geq$20'])
#plt.tight_layout(h_pad=-6, w_pad=2)
# plt.savefig('dist_time_auto_tip.png')
plt.show()
In [50]:
# Centre (latitude, longitude) and half-widths, in degrees, of the pickup box, plus the
# number of grid points per axis used to bin pickups by location.
center_lat, center_lng = 40.76, -73.925
dlat = dlng = 0.125
nlatbins = nlngbins = 100
def drop_mean(ary, min_count=50):
    """Return the mean of ``ary``, or 0.0 when it holds too few samples.

    Intended as a groupby aggregator: groups with fewer than ``min_count``
    elements are reported as 0.0 instead of a mean based on little data.

    Parameters
    ----------
    ary : object exposing ``.size`` and accepted by ``np.mean``
        For example a NumPy array or a pandas Series.
    min_count : int, optional
        Minimum number of elements required to compute the mean.
        Defaults to 50, the threshold that used to be hard-coded.

    Returns
    -------
    float
        ``np.mean(ary)`` when ``ary.size >= min_count``, otherwise ``0.``.
    """
    if ary.size < min_count:
        return 0.
    return np.mean(ary)
In [52]:
# Evenly spaced longitude / latitude edges spanning the box around (center_lat, center_lng).
lng_bins = np.linspace(center_lng - dlng, center_lng + dlng, nlngbins)
lat_bins = np.linspace(center_lat - dlat, center_lat + dlat, nlatbins)
# Assign every filtered trip's pickup point to a longitude bin and a latitude bin.
clng = pd.cut(taxi_filter_pd.pickup_longitude.values, lng_bins)
clat = pd.cut(taxi_filter_pd.pickup_latitude.values, lat_bins)
s_tipfrac = pd.Series(taxi_filter_pd.tip_frac)
# Per (longitude bin, latitude bin) cell: the mean tip percentage as computed by drop_mean,
# and the number of fares.
gb_mean = s_tipfrac.groupby([clng.codes, clat.codes]).apply(drop_mean)
gb_cnts = s_tipfrac.groupby([clng.codes, clat.codes]).count()
# Reindex onto the full grid of code pairs (-1 .. n-2 on each axis) so that cells with no
# fares become explicit entries filled with 0 and the result can be reshaped to 2-D.
mi = pd.MultiIndex.from_product([range(-1, nlngbins-1), range(-1, nlatbins-1)], names=['lng', 'lat'])
gb_mean_ri = gb_mean.reindex(mi).fillna(0)
gb_cnts_ri = gb_cnts.reindex(mi).fillna(0)
In [53]:
# Map of the number of fares per pickup-location cell, on a log10 colour scale.
plt.figure(figsize=(12, 12))
plt.rcParams.update({'font.size': 20})
#plt.subplot(1, 2, 1)
# Flat counts reshaped to (longitude code, latitude code).
weighted_coord_bins_cnts = np.reshape(gb_cnts_ri.values, (nlngbins, nlatbins))
extent = [lng_bins[0], lng_bins[-1], lat_bins[0], lat_bins[-1]]
# Transpose so latitude runs along y and drop the first row/column (bin code -1).
# Adding 0.1 keeps log10 finite for cells with no fares.
plt.imshow(np.log10(weighted_coord_bins_cnts.T[1:, 1:] + 0.1), interpolation='bicubic', origin='lower',
extent=extent, vmin=0, vmax=4.5, cmap="RdYlBu")
plt.xlabel('longitude (degrees)', size=14)
plt.ylabel('latitude (degrees)', size=14)
cbar = plt.colorbar(shrink=0.7, ticks=[0, 1, 2, 3, 4], pad=0.025)
cbar.set_label('number of fares', rotation=270, labelpad=13, size=14)
cbar.set_ticklabels([r'$\leq1$', r'$10^1$', r'$10^2$', r'$10^3$', r'$\geq10^4$'])